Loading Packages

library(data.table) # Efficient Dataframe 
library(lubridate) # For Dates 
library(tidyverse) # Multiple Package for Useful Data wrangling
library(esquisse) # Intuitive plotting
library(plyr) # Data splitting
library(dplyr) # Data Wrangling
library(ggplot2) # Plot Graphs
library(naniar) # for NA exploration in Dataframe
library(plotly) # Make ggplot2 Dynamic
library(gridExtra) # Multiple Plot at once
library(RColorBrewer) # For Color Palette
library(rmdformats) # Theme of HTML
library(flextable) # Show Table
library(class) # K-NN
library(summarytools) # Beautiful and Efficient Summary for Dataset
library(pivottabler) # Pivot Table
library(naivebayes) # Naive Bayes Function
library(caret) # Confusion Matrix
library(leaps) # Exhaustive Search
library(forecast) # Predictions
library(neuralnet) # Neural Network
library(nnet) # Neural Network
library(manipulateWidget) # Plotly Combiner
library(rpart) # Regression Tree
library(rpart.plot) # Plotting Regression Tree

Dataset Preparation

Loading the dataset called “ToyotaCorolla.csv”

# Load the Dataset with Fread()
# (data.table::fread — fast CSV reader; returns a data.table)
ToyotaDT <- fread("DATA/ToyotaCorolla.csv")

Quick Preview

# Preview of the Dataset: interactive table of the first 6 rows (DT package)
DT::datatable(head(ToyotaDT))

Dataset Description: The file ToyotaCorolla.csv contains data on used cars on sale during late summer of 2004 in the Netherlands. It has 1436 records containing detail on 38 attributes, including Price, Age, Kilometers, HP (Horse Power), and other specifications.

# Variable-by-variable summary of the dataset (summarytools::dfSummary).
# plain.ascii/style are set for clean rendering in the knitted HTML;
# graph.magnif scales the inline distribution graphs, which are written
# to tmp.img.dir during knitting.
dfSummary(ToyotaDT, 
          plain.ascii  = FALSE, 
          style        = "grid", 
          graph.magnif = 0.75, 
          valid.col    = FALSE,
          tmp.img.dir  = "/tmp")

Data Frame Summary

ToyotaDT

Dimensions: 1436 x 39
Duplicates: 0

No Variable Stats / Values Freqs (% of Valid) Graph Missing
1 Id
[integer]
Mean (sd) : 721.6 (416.5)
min < med < max:
1 < 721.5 < 1442
IQR (CV) : 719.5 (0.6)
1436 distinct values 0
(0.0%)
2 Model
[character]
1. TOYOTA Corolla 1.6 16V HA
2. TOYOTA Corolla 1.3 16V HA
3. TOYOTA Corolla 1.6 16V LI
4. TOYOTA Corolla 1.6 16V LI
5. TOYOTA Corolla 1.6 16V SE
6. TOYOTA Corolla 1.4 16V VV
7. TOYOTA Corolla 1.3 16V LI
8. TOYOTA Corolla 1.6 16V VV
9. TOYOTA Corolla 1.6 16V WA
10. TOYOTA Corolla 1.6 16V VV
[ 362 others ]
107 ( 7.5%)
83 ( 5.8%)
79 ( 5.5%)
70 ( 4.9%)
43 ( 3.0%)
42 ( 2.9%)
35 ( 2.4%)
31 ( 2.2%)
28 ( 1.9%)
27 ( 1.9%)
891 (62.0%)
0
(0.0%)
3 Price
[integer]
Mean (sd) : 10730.8 (3627)
min < med < max:
4350 < 9900 < 32500
IQR (CV) : 3500 (0.3)
236 distinct values 0
(0.0%)
4 Age_08_04
[integer]
Mean (sd) : 55.9 (18.6)
min < med < max:
1 < 61 < 80
IQR (CV) : 26 (0.3)
77 distinct values 0
(0.0%)
5 Mfg_Month
[integer]
Mean (sd) : 5.5 (3.4)
min < med < max:
1 < 5 < 12
IQR (CV) : 5 (0.6)
12 distinct values 0
(0.0%)
6 Mfg_Year
[integer]
Mean (sd) : 1999.6 (1.5)
min < med < max:
1998 < 1999 < 2004
IQR (CV) : 3 (0)
1998 : 392 (27.3%)
1999 : 441 (30.7%)
2000 : 225 (15.7%)
2001 : 192 (13.4%)
2002 : 87 ( 6.1%)
2003 : 75 ( 5.2%)
2004 : 24 ( 1.7%)
0
(0.0%)
7 KM
[integer]
Mean (sd) : 68533.3 (37506.4)
min < med < max:
1 < 63389.5 < 243000
IQR (CV) : 44020.8 (0.5)
1263 distinct values 0
(0.0%)
8 Fuel_Type
[character]
1. CNG
2. Diesel
3. Petrol
17 ( 1.2%)
155 (10.8%)
1264 (88.0%)
0
(0.0%)
9 HP
[integer]
Mean (sd) : 101.5 (15)
min < med < max:
69 < 110 < 192
IQR (CV) : 20 (0.1)
12 distinct values 0
(0.0%)
10 Met_Color
[integer]
Min : 0
Mean : 0.7
Max : 1
0 : 467 (32.5%)
1 : 969 (67.5%)
0
(0.0%)
11 Color
[character]
1. Beige
2. Black
3. Blue
4. Green
5. Grey
6. Red
7. Silver
8. Violet
9. White
10. Yellow
3 ( 0.2%)
191 (13.3%)
283 (19.7%)
220 (15.3%)
301 (21.0%)
278 (19.4%)
122 ( 8.5%)
4 ( 0.3%)
31 ( 2.2%)
3 ( 0.2%)
0
(0.0%)
12 Automatic
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1356 (94.4%)
1 : 80 ( 5.6%)
0
(0.0%)
13 CC
[integer]
Mean (sd) : 1576.9 (424.4)
min < med < max:
1300 < 1600 < 16000
IQR (CV) : 200 (0.3)
13 distinct values 0
(0.0%)
14 Doors
[integer]
Mean (sd) : 4 (1)
min < med < max:
2 < 4 < 5
IQR (CV) : 2 (0.2)
2 : 2 ( 0.1%)
3 : 622 (43.3%)
4 : 138 ( 9.6%)
5 : 674 (46.9%)
0
(0.0%)
15 Cylinders
[integer]
1 distinct value 4 : 1436 (100.0%) 0
(0.0%)
16 Gears
[integer]
Mean (sd) : 5 (0.2)
min < med < max:
3 < 5 < 6
IQR (CV) : 0 (0)
3 : 2 ( 0.1%)
4 : 1 ( 0.1%)
5 : 1390 (96.8%)
6 : 43 ( 3.0%)
0
(0.0%)
17 Quarterly_Tax
[integer]
Mean (sd) : 87.1 (41.1)
min < med < max:
19 < 85 < 283
IQR (CV) : 16 (0.5)
13 distinct values 0
(0.0%)
18 Weight
[integer]
Mean (sd) : 1072.5 (52.6)
min < med < max:
1000 < 1070 < 1615
IQR (CV) : 45 (0)
59 distinct values 0
(0.0%)
19 Mfr_Guarantee
[integer]
Min : 0
Mean : 0.4
Max : 1
0 : 848 (59.1%)
1 : 588 (40.9%)
0
(0.0%)
20 BOVAG_Guarantee
[integer]
Min : 0
Mean : 0.9
Max : 1
0 : 150 (10.4%)
1 : 1286 (89.6%)
0
(0.0%)
21 Guarantee_Period
[integer]
Mean (sd) : 3.8 (3)
min < med < max:
3 < 3 < 36
IQR (CV) : 0 (0.8)
3 : 1274 (88.7%)
6 : 77 ( 5.4%)
12 : 73 ( 5.1%)
13 : 1 ( 0.1%)
18 : 1 ( 0.1%)
20 : 1 ( 0.1%)
24 : 4 ( 0.3%)
28 : 1 ( 0.1%)
36 : 4 ( 0.3%)
0
(0.0%)
22 ABS
[integer]
Min : 0
Mean : 0.8
Max : 1
0 : 268 (18.7%)
1 : 1168 (81.3%)
0
(0.0%)
23 Airbag_1
[integer]
Min : 0
Mean : 1
Max : 1
0 : 42 ( 2.9%)
1 : 1394 (97.1%)
0
(0.0%)
24 Airbag_2
[integer]
Min : 0
Mean : 0.7
Max : 1
0 : 398 (27.7%)
1 : 1038 (72.3%)
0
(0.0%)
25 Airco
[integer]
Min : 0
Mean : 0.5
Max : 1
0 : 706 (49.2%)
1 : 730 (50.8%)
0
(0.0%)
26 Automatic_airco
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1355 (94.4%)
1 : 81 ( 5.6%)
0
(0.0%)
27 Boardcomputer
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1013 (70.5%)
1 : 423 (29.5%)
0
(0.0%)
28 CD_Player
[integer]
Min : 0
Mean : 0.2
Max : 1
0 : 1122 (78.1%)
1 : 314 (21.9%)
0
(0.0%)
29 Central_Lock
[integer]
Min : 0
Mean : 0.6
Max : 1
0 : 603 (42.0%)
1 : 833 (58.0%)
0
(0.0%)
30 Powered_Windows
[integer]
Min : 0
Mean : 0.6
Max : 1
0 : 629 (43.8%)
1 : 807 (56.2%)
0
(0.0%)
31 Power_Steering
[integer]
Min : 0
Mean : 1
Max : 1
0 : 32 ( 2.2%)
1 : 1404 (97.8%)
0
(0.0%)
32 Radio
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1226 (85.4%)
1 : 210 (14.6%)
0
(0.0%)
33 Mistlamps
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1067 (74.3%)
1 : 369 (25.7%)
0
(0.0%)
34 Sport_Model
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1005 (70.0%)
1 : 431 (30.0%)
0
(0.0%)
35 Backseat_Divider
[integer]
Min : 0
Mean : 0.8
Max : 1
0 : 330 (23.0%)
1 : 1106 (77.0%)
0
(0.0%)
36 Metallic_Rim
[integer]
Min : 0
Mean : 0.2
Max : 1
0 : 1142 (79.5%)
1 : 294 (20.5%)
0
(0.0%)
37 Radio_cassette
[integer]
Min : 0
Mean : 0.1
Max : 1
0 : 1227 (85.4%)
1 : 209 (14.6%)
0
(0.0%)
38 Parking_Assistant
[integer]
Min : 0
Mean : 0
Max : 1
0 : 1432 (99.7%)
1 : 4 ( 0.3%)
0
(0.0%)
39 Tow_Bar
[integer]
Min : 0
Mean : 0.3
Max : 1
0 : 1037 (72.2%)
1 : 399 (27.8%)
0
(0.0%)

Missing Variables Plot

# Missing Variables Plot for the Dataset (naniar);
# show_pct = TRUE reports the percentage of missing values per column.
gg_miss_var(ToyotaDT, show_pct = TRUE)

We can see that there are no missing values in our dataset, ToyotaCorolla.csv.

Ex 6.4

Predicting Prices of Used Cars

Split the data into training (50%), validation (30%), and test (20%) datasets.

# Fix the RNG seed so the partition is reproducible
set.seed(1)

# Assign every row to one of three partitions:
# 1 = training (50%), 2 = validation (30%), 3 = test (20%)
partition <- sample(1:3,size=nrow(ToyotaDT),replace=TRUE,prob=c(0.5,0.3,0.2))
Training   <- ToyotaDT[partition == 1, ]
Validation <- ToyotaDT[partition == 2, ]
Test       <- ToyotaDT[partition == 3, ]

# Realized proportions in % (will differ slightly from the target probabilities)
pct_of_total <- function(part) (nrow(part) / nrow(ToyotaDT)) * 100
Prop_Training  <- pct_of_total(Training)
Prop_Validation <- pct_of_total(Validation)
Prop_Test      <- pct_of_total(Test)

# Report the realized split
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")

[1] “The Proportions are: 52.58 % In Training, 27.79 % In Validation, and 19.64 % In Test”


Run a multiple linear regression

with the outcome variable Price and predictor variables Age_08, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.


Outcome Variable

Numerical: Price


Explanatory Variables

Numerical: Age_08_04, KM, HP, Doors, Quarterly_Tax, Guarantee_Period

Categorical/Dummy: Fuel_Type, Automatic, Mfr_Guarantee, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, Tow_Bar


Running the Linear Regression

# Turn off scientific notation for all subsequent printed output
options(scipen = 999)

# Fit an OLS regression of Price on the predictors requested by the
# exercise, using the training partition only.
Regression_Price <- lm(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = Training
)

# Coefficient table, significance codes, and fit statistics
summary(Regression_Price)
FALSE 
FALSE Call:
FALSE lm(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + 
FALSE     Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + 
FALSE     Airco + Automatic_airco + CD_Player + Powered_Windows + Sport_Model + 
FALSE     Tow_Bar, data = Training)
FALSE 
FALSE Residuals:
FALSE     Min      1Q  Median      3Q     Max 
FALSE -4605.0  -725.0    26.5   655.5  5567.0 
FALSE 
FALSE Coefficients:
FALSE                      Estimate   Std. Error t value             Pr(>|t|)    
FALSE (Intercept)      10149.303878   753.570928  13.468 < 0.0000000000000002 ***
FALSE Age_08_04         -106.254953     3.648627 -29.122 < 0.0000000000000002 ***
FALSE KM                  -0.018110     0.001693 -10.697 < 0.0000000000000002 ***
FALSE Fuel_TypeDiesel   1775.761977   509.701393   3.484             0.000523 ***
FALSE Fuel_TypePetrol   2087.180327   530.110397   3.937   0.0000902216317796 ***
FALSE HP                  26.355347     3.846774   6.851   0.0000000000154436 ***
FALSE Automatic          605.249517   178.506754   3.391             0.000734 ***
FALSE Doors              153.627129    46.146978   3.329             0.000915 ***
FALSE Quarterly_Tax       16.814925     2.081831   8.077   0.0000000000000027 ***
FALSE Mfr_Guarantee      125.684485    91.024994   1.381             0.167768    
FALSE Guarantee_Period    72.439768    15.661721   4.625   0.0000044171894508 ***
FALSE Airco              175.868563   106.848246   1.646             0.100197    
FALSE Automatic_airco   3178.052845   209.461203  15.173 < 0.0000000000000002 ***
FALSE CD_Player          247.920668   120.057343   2.065             0.039270 *  
FALSE Powered_Windows    426.529261   103.224416   4.132   0.0000400718888028 ***
FALSE Sport_Model        268.489425    98.070812   2.738             0.006336 ** 
FALSE Tow_Bar           -250.505084    97.621305  -2.566             0.010481 *  
FALSE ---
FALSE Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
FALSE 
FALSE Residual standard error: 1155 on 738 degrees of freedom
FALSE Multiple R-squared:  0.8915,  Adjusted R-squared:  0.8891 
FALSE F-statistic: 378.8 on 16 and 738 DF,  p-value: < 0.00000000000000022

Taking into account all the requested variables in our linear regression without further analysis or variable selection, we can observe a fairly high Multiple R-squared, close to 0.8915 and a fit of 0.8891. We have a very significant F-statistic, close to 0 which means that this model is already quite complex and better than a naive model including only the intercept. The most significant variables are the Age of the car, the number of kilometres (KM) and the automatic air conditioning (Automatic_airco), since all have the smallest p-value.


Show Models

# show models
# NOTE(review): `sum` is never defined in this document — presumably it is
# summary(leaps::regsubsets(...)) from a chunk that was dropped; it also
# shadows base::sum. Restore/verify that chunk before re-knitting.
# $which is the logical inclusion matrix: best model at each subset size.
sum$which
FALSE    (Intercept) Age_08_04    KM Fuel_TypeCNG Fuel_TypeDiesel Fuel_TypePetrol
FALSE 1         TRUE      TRUE FALSE        FALSE           FALSE           FALSE
FALSE 2         TRUE      TRUE FALSE        FALSE           FALSE           FALSE
FALSE 3         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 4         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 5         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 6         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 7         TRUE      TRUE  TRUE        FALSE           FALSE           FALSE
FALSE 8         TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 9         TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 10        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 11        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 12        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 13        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 14        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 15        TRUE      TRUE  TRUE         TRUE           FALSE           FALSE
FALSE 16        TRUE      TRUE  TRUE         TRUE            TRUE           FALSE
FALSE       HP Automatic Doors Quarterly_Tax Mfr_Guarantee Guarantee_Period Airco
FALSE 1  FALSE     FALSE FALSE         FALSE         FALSE            FALSE FALSE
FALSE 2  FALSE     FALSE FALSE         FALSE         FALSE            FALSE FALSE
FALSE 3  FALSE     FALSE FALSE         FALSE         FALSE            FALSE FALSE
FALSE 4  FALSE     FALSE FALSE          TRUE         FALSE            FALSE FALSE
FALSE 5   TRUE     FALSE FALSE          TRUE         FALSE            FALSE FALSE
FALSE 6   TRUE     FALSE FALSE          TRUE         FALSE            FALSE FALSE
FALSE 7   TRUE     FALSE FALSE          TRUE         FALSE             TRUE FALSE
FALSE 8   TRUE     FALSE FALSE          TRUE         FALSE             TRUE FALSE
FALSE 9   TRUE      TRUE FALSE          TRUE         FALSE             TRUE FALSE
FALSE 10  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 11  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 12  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 13  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE FALSE
FALSE 14  TRUE      TRUE  TRUE          TRUE         FALSE             TRUE  TRUE
FALSE 15  TRUE      TRUE  TRUE          TRUE          TRUE             TRUE  TRUE
FALSE 16  TRUE      TRUE  TRUE          TRUE          TRUE             TRUE  TRUE
FALSE    Automatic_airco CD_Player Powered_Windows Sport_Model Tow_Bar
FALSE 1            FALSE     FALSE           FALSE       FALSE   FALSE
FALSE 2             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 3             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 4             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 5             TRUE     FALSE           FALSE       FALSE   FALSE
FALSE 6             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 7             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 8             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 9             TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 10            TRUE     FALSE            TRUE       FALSE   FALSE
FALSE 11            TRUE     FALSE            TRUE        TRUE   FALSE
FALSE 12            TRUE     FALSE            TRUE        TRUE    TRUE
FALSE 13            TRUE      TRUE            TRUE        TRUE    TRUE
FALSE 14            TRUE      TRUE            TRUE        TRUE    TRUE
FALSE 15            TRUE      TRUE            TRUE        TRUE    TRUE
FALSE 16            TRUE      TRUE            TRUE        TRUE    TRUE


R-Squared

# show metrics
# R-squared of the best model at each subset size (from the leaps summary
# object `sum`, defined in a chunk not shown here — verify).
sum$rsq
FALSE  [1] 0.7655313 0.8300988 0.8503445 0.8620935 0.8759748 0.8801229 0.8822341
FALSE  [8] 0.8844385 0.8861651 0.8874965 0.8888897 0.8897505 0.8905682 0.8909579
FALSE [15] 0.8913048 0.8914512

R-Squared keeps increasing as we add more and more variables to the model, which is expected since it doesn't account for the number of parameters (no penalty).


Adjusted R-Squared

# Adjusted R-squared per subset size (penalizes additional predictors)
sum$adjr2
FALSE  [1] 0.7652199 0.8296470 0.8497467 0.8613580 0.8751469 0.8791614 0.8811305
FALSE  [8] 0.8831993 0.8847899 0.8859844 0.8872447 0.8879675 0.8886484 0.8888949
FALSE [15] 0.8890985 0.8890979

Adjusted R-Squared keeps increasing up to the 15th value, 0.8890985, which is slightly higher than the last one (16th).


Cp

# Mallows' Cp per subset size (values near p+1 indicate a well-fitting model)
sum$cp
FALSE  [1] 840.94284 404.55680 269.09689 191.32625  99.07820  72.91427  60.58056
FALSE  [8]  47.61333  37.89066  30.85073  23.39186  19.54705  15.99520  15.34962
FALSE [15]  14.99434  16.00000


Looking for a model whose Cp is close to p+1, the 15-predictor model is a good candidate (Cp ≈ 14.99 versus p+1 = 16); the full 16-predictor model has Cp = 16 by construction, so it provides no evidence on its own.

a. What appear to be the three or four most important car specifications for predicting the car’s price?

After the exhaustive search and the popular subset-selection algorithms, the 3-4 most important variables for predicting a car's price are: Age, KM, Automatic_airco and Quarterly_Tax. Depending on whether we decide to drop the variable Mfr_Guarantee, we might get slightly different predictions, but the most important variables should stay the same. The following performance assessment keeps the same full model.

b. Using metrics you consider useful, assess the performance of the model in predicting prices.

Computing Prediction with the Regression Model on Validation - Accuracy

library(forecast)
# use predict() to make predictions on a new set.
car.lm.pred <- predict(Regression_Price, Validation)
# Bug fix: options(digits = 0) is invalid — R only accepts digits in 1..22
# and raises an error. Round the displayed columns explicitly instead,
# which reproduces the intended whole-number display.
options(scipen = 999)
# First 20 predictions vs. actual prices, with their residuals
some.residuals <- Validation$Price[1:20] - car.lm.pred[1:20]
data.frame("Predicted" = round(car.lm.pred[1:20]), "Actual" = Validation$Price[1:20],
"Residual" = round(some.residuals))
FALSE    Predicted Actual Residual
FALSE 1      15327  13950    -1377
FALSE 2      14920  18600     3680
FALSE 3      17033  21500     4467
FALSE 4      20592  19600     -992
FALSE 5      20061  22500     2439
FALSE 6      19881  22750     2869
FALSE 7      14134  16950     2816
FALSE 8      17294  15950    -1344
FALSE 9      14919  15750      831
FALSE 10     14604  15750     1146
FALSE 11     14910  15950     1040
FALSE 12     14286  15750     1464
FALSE 13     14478  16750     2272
FALSE 14     14107  13950     -157
FALSE 15     15345  16950     1605
FALSE 16     16061  16950      889
FALSE 17     19033  19000      -33
FALSE 18     16568  17950     1382
FALSE 19     19668  21950     2282
FALSE 20     14663  15250      587
# Restore 3 significant digits for the accuracy table
options(scipen=999, digits = 3)
# use accuracy() to compute common accuracy measures (forecast::accuracy):
# ME, RMSE, MAE, MPE, MAPE of the validation-set predictions.
accuracy(car.lm.pred, Validation$Price)
FALSE          ME RMSE MAE    MPE MAPE
FALSE Test set 83 1319 948 -0.789 8.87

Here the resulting metrics for our predictions errors. The closest to 0 the better.

Histogram of Validation Errors

library(forecast)
# Residuals of the regression model on the validation partition
car.lm.pred <- predict(Regression_Price, Validation)
all.residuals <- Validation$Price - car.lm.pred
# Share of validation residuals within +/- 1406 (roughly one RMSE).
# Bug fix: the denominator was hard-coded to 400, but this validation split
# has ~399 rows; mean() of the logical mask uses the true count.
mean(all.residuals > -1406 & all.residuals < 1406)
hist(all.residuals, breaks = 25, xlab = "Residuals", main = "", col = "#1c6155")

We can see the residuals of our predictions on the validation set. The spread is clearly visible, mostly between -2000 and 2000 and approximately normally distributed. Some extreme residual values appear on the far right.

Ex 9.3

Predicting Prices of Used Cars (Regression Trees).

Data Preprocessing - 60% Training and 40% Validation

# Reproducible split for the tree exercise
set.seed(1)

# Re-read the raw data so this section is independent of Ex 6.4
ToyotaDT_TREE <- fread("DATA/ToyotaCorolla.csv")

# Target split: 60% training / 40% validation
Prob_Train <- 0.6
Prob_Validation <- 1 - Prob_Train

# TRUE rows go to training, FALSE rows to validation
# (renamed from `sample`, which shadowed base::sample)
in_train <- sample(c(TRUE, FALSE), nrow(ToyotaDT_TREE), replace=TRUE, prob=c(Prob_Train,Prob_Validation))
Training_TREE  <- ToyotaDT_TREE[in_train, ]
Validation_TREE   <- ToyotaDT_TREE[!in_train, ]

# Realized proportions (%)
Prop_Train <- 100 * nrow(Training_TREE) / nrow(ToyotaDT_TREE)
Prop_Valid <- 100 * nrow(Validation_TREE) / nrow(ToyotaDT_TREE)

# Printing Proportions for double checking
print(paste(round(Prop_Train,2),"% In Training", round(Prop_Valid,2),"% In Validation"))
FALSE [1] "61.42 % In Training 38.58 % In Validation"

a. Run a Regression Tree

with CP=0.001 and “ANOVA” Method

# Set Seed
set.seed(1)

# Regression Tree Packages
library(rpart)
library(rpart.plot)

# As Factor Fuel_Type
#ToyotaDT_TREE$Fuel_Type <- as.factor(ToyotaDT_TREE$Fuel_Type)

# Regression Tree Parameters
cp_1 = 0.001
method = "anova"
minbucket = 1
maxdepth = 30 #30 maximum

# Running Regression Tree.
# Fixes: (1) the predictor list contained `Automatic` twice and was missing
# `Automatic_airco`, which the exercise requests and the linear model uses;
# (2) `method` is an argument of rpart(), not rpart.control(), where it was
# silently ignored ("anova" is the default for a numeric outcome, so the
# earlier results happened to match, but the intent is now explicit).
Tree_1 <- rpart(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = Training_TREE,
  method = method,
  control = rpart.control(maxdepth = maxdepth, cp = cp_1, minbucket = minbucket)
)

# Plotting Regression Tree (terminal nodes shaded gray)
prp(Tree_1, type = 1, extra = 1, under = TRUE, split.font = 1, varlen = -10,
    box.col = ifelse(Tree_1$frame$var == "<leaf>", 'gray', 'white'))

# Number of terminal nodes (leaves)
Length_Tree_1 = length(Tree_1$frame$var[Tree_1$frame$var == "<leaf>"])

print(paste("There is", Length_Tree_1,"Number of Leaves"))
FALSE [1] "There is 31 Number of Leaves"

CP Plot

# Tree Size: cross-validated relative error vs. complexity parameter,
# used to choose how far to prune.
plotcp(Tree_1)

With a small CP, our regression tree can grow to 31 leaves, but the relative error already appears small enough between 9 and 13 leaves.

i. Which appear to be the three or four most important car specifications for predicting the car’s price?

# Most 3 or 4 specifications for predictions in car's price.
# Printing the tree shows node), split, n, deviance, yval per row;
# '*' marks terminal nodes. Variables used in the earliest/highest splits
# are the most influential.
Tree_1
FALSE n= 882 
FALSE 
FALSE node), split, n, deviance, yval
FALSE       * denotes terminal node
FALSE 
FALSE   1) root 882 11000000000 10700  
FALSE     2) Age_08_04>=31.5 773  3070000000  9630  
FALSE       4) Age_08_04>=56.5 516   839000000  8670  
FALSE         8) Age_08_04>=68.5 248   283000000  7970  
FALSE          16) KM>=9.94e+04 71    91800000  7400  
FALSE            32) KM>=2e+05 5     3580000  5700 *
FALSE            33) KM< 2e+05 66    72600000  7530 *
FALSE          17) KM< 9.94e+04 177   160000000  8190 *
FALSE         9) Age_08_04< 68.5 268   316000000  9330  
FALSE          18) KM>=1.38e+05 11     9700000  7390 *
FALSE          19) KM< 1.38e+05 257   263000000  9410  
FALSE            38) Quarterly_Tax< 78.5 144   130000000  9070  
FALSE              76) KM>=4.23e+04 126   101000000  8940  
FALSE               152) Airco< 0.5 86    59000000  8730 *
FALSE               153) Airco>=0.5 40    30600000  9390 *
FALSE              77) KM< 4.23e+04 18    12700000  9950 *
FALSE            39) Quarterly_Tax>=78.5 113    94200000  9850  
FALSE              78) KM>=8.82e+04 34    17000000  9270 *
FALSE              79) KM< 8.82e+04 79    60600000 10100 *
FALSE       5) Age_08_04< 56.5 257   830000000 11500  
FALSE        10) KM>=1.28e+05 13    35200000  7040  
FALSE          20) Quarterly_Tax< 74.5 4     7930000  5320 *
FALSE          21) Quarterly_Tax>=74.5 9    10200000  7810 *
FALSE        11) KM< 1.28e+05 244   518000000 11800  
FALSE          22) Age_08_04>=44.5 119   210000000 11000  
FALSE            44) Airco< 0.5 57    56200000 10600  
FALSE              88) Mfr_Guarantee< 0.5 33    25500000 10200 *
FALSE              89) Mfr_Guarantee>=0.5 24    18300000 11100 *
FALSE            45) Airco>=0.5 62   132000000 11400  
FALSE              90) KM>=5.66e+04 41    35600000 11000 *
FALSE              91) KM< 5.66e+04 21    71700000 12300  
FALSE               182) Quarterly_Tax>=44 19    24300000 11900 *
FALSE               183) Quarterly_Tax< 44 2    18000000 16000 *
FALSE          23) Age_08_04< 44.5 125   172000000 12500  
FALSE            46) Powered_Windows< 0.5 49    58700000 12000 *
FALSE            47) Powered_Windows>=0.5 76    90100000 12800 *
FALSE     3) Age_08_04< 31.5 109   896000000 18200  
FALSE       6) HP< 113 100   506000000 17700  
FALSE        12) Age_08_04>=22.5 39   137000000 16400  
FALSE          24) HP< 104 17    30100000 14900  
FALSE            48) Sport_Model< 0.5 6      929000 13500 *
FALSE            49) Sport_Model>=0.5 11     9230000 15700 *
FALSE          25) HP>=104 22    40000000 17600 *
FALSE        13) Age_08_04< 22.5 61   261000000 18600  
FALSE          26) HP< 104 30   120000000 17600  
FALSE            52) Quarterly_Tax< 222 24    26000000 17000  
FALSE             104) Age_08_04>=14 19    11500000 16600 *
FALSE             105) Age_08_04< 14 5      740000 18500 *
FALSE            53) Quarterly_Tax>=222 6    43700000 20200  
FALSE             106) KM>=4.43e+04 4     5370000 18500 *
FALSE             107) KM< 4.43e+04 2     3750000 23600 *
FALSE          27) HP>=104 31    91800000 19500  
FALSE            54) Quarterly_Tax< 44 5     6780000 17000 *
FALSE            55) Quarterly_Tax>=44 26    48000000 19900  
FALSE             110) Airco< 0.5 1           0 16400 *
FALSE             111) Airco>=0.5 25    34700000 20100 *
FALSE       7) HP>=113 9   113000000 23500  
FALSE        14) Quarterly_Tax< 258 8    22600000 22400  
FALSE          28) Age_08_04>=15 5     2890000 21300 *
FALSE          29) Age_08_04< 15 3     2670000 24300 *
FALSE        15) Quarterly_Tax>=258 1           0 32500 *
# Variable importance scores from rpart (summed goodness-of-split measures)
Tree_1_Importance <- as.data.frame(Tree_1$variable.importance)

# Interactive table of the importance scores
DT::datatable(Tree_1_Importance)

Using our regression tree output, we can see that the first few splits are based on Age alone. The 4 most important variables are Age_08_04, KM, Quarterly_Tax and HP.

ii. RMS Error between Training and Validation

# Predictions of Training and Validation
Training_Predictions <- predict(Tree_1,Training_TREE)
Validation_Predictions <- predict(Tree_1, Validation_TREE)

# RMSE  ------------------------------------------------------------------------

# Training RMSE (caret::RMSE)
RMSE_Training <- RMSE(Training_Predictions,Training_TREE$Price)

# Validation RMSE 
RMSE_Validation <- RMSE(Validation_Predictions, Validation_TREE$Price)

# One-row data frame holding both metrics (built directly; the previous
# cbind + as.data.frame round-trip was unnecessary)
RMSE_Dataframe <- data.frame(RMSE_Training, RMSE_Validation)

# Bug fix: set_header_labels() was handed RMSE_Dataframe a second time as a
# stray positional argument; the flextable already arrives through the pipe.
flextable(RMSE_Dataframe) %>%
  set_header_labels(values = list(RMSE_Training = "RMSE Training",
                                  RMSE_Validation = "RMSE Validation"))

Computing Residuals Boxplots

# Computing Residuals --------------------------------------------------------------
Residuals_Training_Tree_1 <- Training_TREE$Price - Training_Predictions
Residuals_Validation_Tree_1 <- Validation_TREE$Price - Validation_Predictions

Residuals_Training_Tree_1 <- as.data.frame(Residuals_Training_Tree_1)
Residuals_Validation_Tree_1 <- as.data.frame(Residuals_Validation_Tree_1)

# Boxplots -------------------------------------------------------------------------

library(ggplot2)

# Improvement: the x-axis captions previously hard-coded "882"/"554"
# observations; derive the counts from the data so the labels stay correct
# if the split (or seed) changes. With the current data the output is identical.
n_train <- nrow(Residuals_Training_Tree_1)
n_valid <- nrow(Residuals_Validation_Tree_1)

# Boxplot Training
box1 <- ggplotly(ggplot(Residuals_Training_Tree_1) +
 aes(x = "", y = Residuals_Training_Tree_1) +
 geom_boxplot(fill = "#1c6155") +
 labs(x = paste("with", n_train, "observations"), y = "Price Residuals",
      title = "Residuals Between Training and Predicted", subtitle = "Boxplot",
      caption = "") +
 theme_minimal() + ylim(-10000, 10000) + theme(text = element_text(size = 8)))

# Boxplot Validation
box2 <- ggplotly(ggplot(Residuals_Validation_Tree_1) +
 aes(x = "", y = Residuals_Validation_Tree_1) +
 geom_boxplot(fill = "#1c6155") +
 labs(x = paste("with", n_valid, "observations"), y = "Price Residuals",
      title = "Residuals Between Validation and Predicted", subtitle = "Boxplot",
      caption = "") +
 theme_minimal() + ylim(-10000, 10000) + theme(text = element_text(size = 8)))

# Combine them side by side (manipulateWidget::combineWidgets)
combineWidgets(box1,box2, ncol=2)

iii. How can we achieve predictions for the training set that are not equal to the actual prices?

iv. Prune the full tree using the cross-validation error. Compared to the full tree, what is the predictive performance for the validation set?

CP Table of our full Tree Regression - CP=0

# Grow the full (unpruned) tree: cp = 0 applies no complexity penalty.
cp_2 = 0

# Fixes: (1) the predictor list contained `Automatic` twice and was missing
# `Automatic_airco` (requested by the exercise); (2) `method` is an argument
# of rpart(), not rpart.control(), where it was silently ignored.
# maxdepth, minbucket and method are reused from the earlier tree chunk.
Tree_Full <- rpart(
  Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +
    Quarterly_Tax + Mfr_Guarantee + Guarantee_Period + Airco +
    Automatic_airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
  data = Training_TREE,
  method = method,
  control = rpart.control(maxdepth = maxdepth, cp = cp_2, minbucket = minbucket)
)

# Cross-validation error table (xerror), used below to pick a pruning cp
Tree_Full$cptable
FALSE               CP nsplit rel error xerror   xstd
FALSE 1   0.6393510709      0   1.00000  1.001 0.0773
FALSE 2   0.1276629838      1   0.36065  0.363 0.0254
FALSE 3   0.0251430819      2   0.23299  0.236 0.0239
FALSE 4   0.0251186549      3   0.20784  0.235 0.0240
FALSE 5   0.0217957173      4   0.18272  0.222 0.0232
FALSE 6   0.0123622839      5   0.16093  0.187 0.0221
FALSE 7   0.0098137861      6   0.14857  0.171 0.0198
FALSE 8   0.0082355054      7   0.13875  0.173 0.0200
FALSE 9   0.0060825208      8   0.13052  0.170 0.0203
FALSE 10  0.0045230563      9   0.12443  0.171 0.0204
FALSE 11  0.0039175860     11   0.11539  0.154 0.0152
FALSE 12  0.0035459647     12   0.11147  0.150 0.0149
FALSE 13  0.0033636582     13   0.10792  0.150 0.0149
FALSE 14  0.0031435990     14   0.10456  0.151 0.0149
FALSE 15  0.0028697949     15   0.10142  0.151 0.0150
FALSE 16  0.0022990330     16   0.09855  0.147 0.0149
FALSE 17  0.0020864998     19   0.09165  0.144 0.0147
FALSE 18  0.0018104228     20   0.08956  0.142 0.0147
FALSE 19  0.0015475942     21   0.08775  0.141 0.0149
FALSE 20  0.0015463068     22   0.08621  0.140 0.0149
FALSE 21  0.0014997572     23   0.08466  0.140 0.0149
FALSE 22  0.0014630150     24   0.08316  0.139 0.0149
FALSE 23  0.0014145212     25   0.08170  0.139 0.0149
FALSE 24  0.0012538508     26   0.08028  0.138 0.0149
FALSE 25  0.0012111168     27   0.07903  0.135 0.0148
FALSE 26  0.0011188577     28   0.07782  0.137 0.0150
FALSE 27  0.0010518109     29   0.07670  0.132 0.0122
FALSE 28  0.0009784712     30   0.07565  0.130 0.0122
FALSE 29  0.0009668900     31   0.07467  0.130 0.0121
FALSE 30  0.0009122253     32   0.07370  0.130 0.0121
FALSE 31  0.0008104404     34   0.07188  0.129 0.0116
FALSE 32  0.0008072438     35   0.07107  0.126 0.0112
FALSE 33  0.0008068682     36   0.07026  0.126 0.0112
FALSE 34  0.0007661691     37   0.06945  0.126 0.0112
FALSE 35  0.0007637280     38   0.06869  0.126 0.0113
FALSE 36  0.0007305737     39   0.06792  0.126 0.0115
FALSE 37  0.0007121349     42   0.06570  0.126 0.0115
FALSE 38  0.0006485008     43   0.06499  0.124 0.0111
FALSE 39  0.0006383481     44   0.06434  0.125 0.0111
FALSE 40  0.0006048816     45   0.06370  0.125 0.0111
FALSE 41  0.0005769678     46   0.06310  0.125 0.0113
FALSE 42  0.0005630458     47   0.06252  0.124 0.0113
FALSE 43  0.0005448774     48   0.06196  0.125 0.0113
FALSE 44  0.0005378905     49   0.06141  0.126 0.0113
FALSE 45  0.0005215601     50   0.06088  0.128 0.0114
FALSE 46  0.0004954142     52   0.05983  0.128 0.0115
FALSE 47  0.0004886420     54   0.05884  0.129 0.0115
FALSE 48  0.0004759313     55   0.05835  0.129 0.0115
FALSE 49  0.0004709934     57   0.05740  0.129 0.0115
FALSE 50  0.0004692668     58   0.05693  0.128 0.0115
FALSE 51  0.0004660663     59   0.05646  0.128 0.0115
FALSE 52  0.0004622768     60   0.05600  0.131 0.0115
FALSE 53  0.0004380869     61   0.05553  0.131 0.0115
FALSE 54  0.0004321568     65   0.05378  0.134 0.0116
FALSE 55  0.0004269292     66   0.05335  0.134 0.0116
FALSE 56  0.0004198334     68   0.05250  0.135 0.0117
FALSE 57  0.0004019392     69   0.05208  0.136 0.0117
FALSE 58  0.0003661385     71   0.05127  0.138 0.0117
FALSE 59  0.0003630535     72   0.05091  0.138 0.0117
FALSE 60  0.0003612529     74   0.05018  0.138 0.0117
FALSE 61  0.0003612379     78   0.04873  0.138 0.0117
FALSE 62  0.0003608156     79   0.04837  0.138 0.0117
FALSE 63  0.0003605468     81   0.04765  0.138 0.0117
FALSE 64  0.0003525027     82   0.04729  0.138 0.0117
FALSE 65  0.0003448469     83   0.04694  0.137 0.0117
FALSE 66  0.0003426121     84   0.04659  0.138 0.0117
FALSE 67  0.0003410308     85   0.04625  0.138 0.0117
FALSE 68  0.0003368061     86   0.04591  0.137 0.0117
FALSE 69  0.0003309078     87   0.04557  0.138 0.0115
FALSE 70  0.0003238925     89   0.04491  0.139 0.0115
FALSE 71  0.0003192189     90   0.04459  0.141 0.0116
FALSE 72  0.0003117205     91   0.04427  0.141 0.0116
FALSE 73  0.0003080506     92   0.04395  0.145 0.0118
FALSE 74  0.0002983485     93   0.04365  0.145 0.0118
FALSE 75  0.0002969004    100   0.04156  0.145 0.0115
FALSE 76  0.0002884839    105   0.04001  0.146 0.0115
FALSE 77  0.0002831272    106   0.03972  0.148 0.0115
FALSE 78  0.0002774525    107   0.03944  0.149 0.0115
FALSE 79  0.0002769142    113   0.03778  0.149 0.0116
FALSE 80  0.0002743169    114   0.03750  0.149 0.0116
FALSE 81  0.0002671784    116   0.03695  0.147 0.0113
FALSE 82  0.0002609393    117   0.03668  0.148 0.0113
FALSE 83  0.0002583136    118   0.03642  0.148 0.0113
FALSE 84  0.0002482898    119   0.03616  0.149 0.0113
FALSE 85  0.0002479557    120   0.03592  0.148 0.0113
FALSE 86  0.0002476024    122   0.03542  0.148 0.0113
FALSE 87  0.0002462962    123   0.03517  0.149 0.0113
FALSE 88  0.0002430667    124   0.03493  0.150 0.0113
FALSE 89  0.0002429971    125   0.03468  0.150 0.0113
FALSE 90  0.0002425075    126   0.03444  0.150 0.0113
FALSE 91  0.0002421970    127   0.03420  0.150 0.0113
FALSE 92  0.0002372623    128   0.03396  0.150 0.0113
FALSE 93  0.0002317901    129   0.03372  0.150 0.0113
FALSE 94  0.0002316236    130   0.03349  0.150 0.0113
FALSE 95  0.0002271687    131   0.03325  0.150 0.0113
FALSE 96  0.0002226971    132   0.03303  0.149 0.0113
FALSE 97  0.0002216625    134   0.03258  0.150 0.0113
FALSE 98  0.0002210451    137   0.03192  0.150 0.0113
FALSE 99  0.0002178239    138   0.03170  0.151 0.0113
FALSE 100 0.0002165037    140   0.03126  0.151 0.0113
FALSE 101 0.0002112223    143   0.03053  0.152 0.0114
FALSE 102 0.0002066748    144   0.03032  0.154 0.0115
FALSE 103 0.0002055758    145   0.03011  0.154 0.0115
FALSE 104 0.0002044531    146   0.02991  0.154 0.0115
FALSE 105 0.0002035666    148   0.02950  0.154 0.0115
FALSE 106 0.0002002666    149   0.02930  0.154 0.0115
FALSE 107 0.0001983639    150   0.02910  0.155 0.0115
FALSE 108 0.0001944215    151   0.02890  0.155 0.0115
FALSE 109 0.0001942004    153   0.02851  0.155 0.0115
FALSE 110 0.0001939594    156   0.02793  0.155 0.0115
FALSE 111 0.0001912212    157   0.02773  0.155 0.0115
FALSE 112 0.0001880072    159   0.02735  0.155 0.0115
FALSE 113 0.0001848405    160   0.02716  0.156 0.0115
FALSE 114 0.0001844292    161   0.02698  0.156 0.0115
FALSE 115 0.0001834970    162   0.02679  0.156 0.0115
FALSE 116 0.0001822589    163   0.02661  0.156 0.0115
FALSE 117 0.0001818370    165   0.02624  0.156 0.0115
FALSE 118 0.0001785627    166   0.02606  0.156 0.0115
FALSE 119 0.0001756469    168   0.02571  0.156 0.0115
FALSE 120 0.0001754198    169   0.02553  0.157 0.0115
FALSE 121 0.0001705471    170   0.02535  0.157 0.0115
FALSE 122 0.0001703013    171   0.02518  0.157 0.0115
FALSE 123 0.0001673454    172   0.02501  0.157 0.0115
FALSE 124 0.0001673316    177   0.02416  0.157 0.0115
FALSE 125 0.0001661401    178   0.02399  0.157 0.0115
FALSE 126 0.0001657838    179   0.02383  0.157 0.0115
FALSE 127 0.0001599365    180   0.02366  0.157 0.0115
FALSE 128 0.0001598879    181   0.02350  0.158 0.0115
FALSE 129 0.0001593978    182   0.02334  0.158 0.0115
FALSE 130 0.0001536830    183   0.02318  0.158 0.0115
FALSE 131 0.0001526034    184   0.02303  0.158 0.0115
FALSE 132 0.0001512941    185   0.02288  0.158 0.0115
FALSE 133 0.0001498673    187   0.02257  0.159 0.0115
FALSE 134 0.0001478174    188   0.02242  0.159 0.0115
FALSE 135 0.0001406148    189   0.02228  0.159 0.0115
FALSE 136 0.0001388589    190   0.02213  0.160 0.0115
FALSE 137 0.0001366632    191   0.02200  0.160 0.0115
FALSE 138 0.0001362358    193   0.02172  0.160 0.0115
FALSE 139 0.0001359028    194   0.02159  0.160 0.0115
FALSE 140 0.0001351130    195   0.02145  0.160 0.0115
FALSE 141 0.0001326665    196   0.02132  0.160 0.0115
FALSE 142 0.0001306655    197   0.02118  0.160 0.0115
FALSE 143 0.0001298857    198   0.02105  0.160 0.0115
FALSE 144 0.0001288976    200   0.02079  0.160 0.0115
FALSE 145 0.0001276217    203   0.02041  0.160 0.0115
FALSE 146 0.0001266344    204   0.02028  0.160 0.0115
FALSE 147 0.0001251232    205   0.02015  0.160 0.0115
FALSE 148 0.0001242568    208   0.01978  0.160 0.0115
FALSE 149 0.0001222672    209   0.01965  0.162 0.0116
FALSE 150 0.0001201600    210   0.01953  0.161 0.0115
FALSE 151 0.0001195472    211   0.01941  0.161 0.0115
FALSE 152 0.0001186765    212   0.01929  0.161 0.0115
FALSE 153 0.0001121864    213   0.01917  0.162 0.0115
FALSE 154 0.0001108127    214   0.01906  0.163 0.0116
FALSE 155 0.0001103510    216   0.01884  0.162 0.0116
FALSE 156 0.0001091508    217   0.01873  0.163 0.0116
FALSE 157 0.0001090535    219   0.01851  0.163 0.0116
FALSE 158 0.0001084726    220   0.01840  0.163 0.0116
FALSE 159 0.0001080823    221   0.01829  0.163 0.0116
FALSE 160 0.0001079040    222   0.01818  0.163 0.0116
FALSE 161 0.0001064078    223   0.01807  0.163 0.0116
FALSE 162 0.0001062725    224   0.01797  0.163 0.0116
FALSE 163 0.0001049583    226   0.01776  0.163 0.0116
FALSE 164 0.0001046291    228   0.01755  0.163 0.0116
FALSE 165 0.0001001892    229   0.01744  0.163 0.0116
FALSE 166 0.0000999928    232   0.01714  0.164 0.0116
FALSE 167 0.0000987192    233   0.01704  0.164 0.0116
FALSE 168 0.0000984241    235   0.01684  0.164 0.0116
FALSE 169 0.0000953840    236   0.01674  0.164 0.0116
FALSE 170 0.0000953119    237   0.01665  0.164 0.0116
FALSE 171 0.0000951155    238   0.01655  0.164 0.0116
FALSE 172 0.0000951091    239   0.01646  0.164 0.0116
FALSE 173 0.0000946082    241   0.01627  0.164 0.0115
FALSE 174 0.0000931798    242   0.01617  0.164 0.0115
FALSE 175 0.0000922480    243   0.01608  0.164 0.0115
FALSE 176 0.0000919592    244   0.01599  0.164 0.0115
FALSE 177 0.0000912680    245   0.01590  0.164 0.0115
FALSE 178 0.0000899935    246   0.01581  0.164 0.0115
FALSE 179 0.0000896196    247   0.01572  0.164 0.0115
FALSE 180 0.0000896147    248   0.01563  0.164 0.0115
FALSE 181 0.0000894289    249   0.01554  0.164 0.0115
FALSE 182 0.0000885560    250   0.01545  0.165 0.0115
FALSE 183 0.0000875802    251   0.01536  0.165 0.0115
FALSE 184 0.0000858569    252   0.01527  0.165 0.0115
FALSE 185 0.0000853280    253   0.01518  0.166 0.0115
FALSE 186 0.0000852487    254   0.01510  0.166 0.0115
FALSE 187 0.0000850414    255   0.01501  0.166 0.0115
FALSE 188 0.0000849366    256   0.01493  0.166 0.0115
FALSE 189 0.0000839667    260   0.01459  0.166 0.0115
FALSE 190 0.0000824227    261   0.01451  0.166 0.0116
FALSE 191 0.0000799439    262   0.01442  0.166 0.0115
FALSE 192 0.0000792590    263   0.01434  0.166 0.0115
FALSE 193 0.0000789059    264   0.01426  0.166 0.0115
FALSE 194 0.0000782107    265   0.01419  0.165 0.0115
FALSE 195 0.0000781153    266   0.01411  0.165 0.0115
FALSE 196 0.0000773309    269   0.01386  0.166 0.0115
FALSE 197 0.0000772079    270   0.01379  0.166 0.0115
FALSE 198 0.0000770803    271   0.01371  0.166 0.0115
FALSE 199 0.0000767396    272   0.01363  0.166 0.0115
FALSE 200 0.0000766326    274   0.01348  0.166 0.0115
FALSE 201 0.0000756866    276   0.01332  0.166 0.0115
FALSE 202 0.0000751000    277   0.01325  0.166 0.0115
FALSE 203 0.0000744869    278   0.01317  0.166 0.0115
FALSE 204 0.0000739682    279   0.01310  0.166 0.0115
FALSE 205 0.0000708540    283   0.01280  0.166 0.0115
FALSE 206 0.0000708046    284   0.01273  0.166 0.0115
FALSE 207 0.0000697527    285   0.01266  0.166 0.0115
FALSE 208 0.0000695370    286   0.01259  0.167 0.0115
FALSE 209 0.0000687314    287   0.01252  0.166 0.0115
FALSE 210 0.0000680695    289   0.01238  0.167 0.0115
FALSE 211 0.0000674066    294   0.01204  0.167 0.0115
FALSE 212 0.0000671615    295   0.01198  0.167 0.0115
FALSE 213 0.0000667555    296   0.01191  0.167 0.0115
FALSE 214 0.0000658662    297   0.01184  0.168 0.0115
FALSE 215 0.0000657153    298   0.01178  0.168 0.0115
FALSE 216 0.0000651210    300   0.01165  0.168 0.0115
FALSE 217 0.0000636524    301   0.01158  0.168 0.0115
FALSE 218 0.0000621252    303   0.01145  0.168 0.0115
FALSE 219 0.0000605493    304   0.01139  0.168 0.0115
FALSE 220 0.0000593504    305   0.01133  0.168 0.0115
FALSE 221 0.0000588561    307   0.01121  0.168 0.0115
FALSE 222 0.0000588539    308   0.01115  0.168 0.0115
FALSE 223 0.0000572853    309   0.01109  0.168 0.0115
FALSE 224 0.0000570693    312   0.01092  0.168 0.0115
FALSE 225 0.0000570300    313   0.01087  0.168 0.0115
FALSE 226 0.0000559130    314   0.01081  0.168 0.0115
FALSE 227 0.0000551755    317   0.01064  0.168 0.0115
FALSE 228 0.0000549957    318   0.01059  0.168 0.0115
FALSE 229 0.0000545511    319   0.01053  0.168 0.0115
FALSE 230 0.0000534044    320   0.01048  0.169 0.0115
FALSE 231 0.0000531509    321   0.01042  0.168 0.0115
FALSE 232 0.0000531156    322   0.01037  0.168 0.0115
FALSE 233 0.0000516687    323   0.01032  0.168 0.0115
FALSE 234 0.0000515278    324   0.01026  0.168 0.0115
FALSE 235 0.0000509675    325   0.01021  0.168 0.0115
FALSE 236 0.0000495354    326   0.01016  0.169 0.0115
FALSE 237 0.0000492638    327   0.01011  0.169 0.0115
FALSE 238 0.0000492152    328   0.01006  0.169 0.0115
FALSE 239 0.0000490449    330   0.00996  0.169 0.0115
FALSE 240 0.0000488381    334   0.00977  0.169 0.0115
FALSE 241 0.0000482516    336   0.00967  0.169 0.0115
FALSE 242 0.0000472436    338   0.00957  0.169 0.0115
FALSE 243 0.0000463580    340   0.00948  0.169 0.0115
FALSE 244 0.0000450443    341   0.00943  0.169 0.0115
FALSE 245 0.0000445037    343   0.00934  0.169 0.0115
FALSE 246 0.0000437468    344   0.00930  0.170 0.0115
FALSE 247 0.0000437166    345   0.00926  0.170 0.0115
FALSE 248 0.0000436474    346   0.00921  0.170 0.0115
FALSE 249 0.0000436409    347   0.00917  0.170 0.0115
FALSE 250 0.0000432932    348   0.00912  0.170 0.0115
FALSE 251 0.0000431687    350   0.00904  0.169 0.0115
FALSE 252 0.0000424077    353   0.00891  0.170 0.0115
FALSE 253 0.0000417979    356   0.00878  0.170 0.0115
FALSE 254 0.0000412113    357   0.00874  0.170 0.0115
FALSE 255 0.0000397131    358   0.00870  0.170 0.0115
FALSE 256 0.0000395197    360   0.00862  0.170 0.0115
FALSE 257 0.0000389407    361   0.00858  0.170 0.0115
FALSE 258 0.0000387515    362   0.00854  0.170 0.0115
FALSE 259 0.0000380599    363   0.00850  0.170 0.0115
FALSE 260 0.0000369350    364   0.00846  0.170 0.0115
FALSE 261 0.0000363674    366   0.00839  0.170 0.0115
FALSE 262 0.0000360389    368   0.00832  0.170 0.0115
FALSE 263 0.0000360344    369   0.00828  0.170 0.0115
FALSE 264 0.0000360032    370   0.00824  0.170 0.0115
FALSE 265 0.0000349975    372   0.00817  0.170 0.0115
FALSE 266 0.0000347685    373   0.00814  0.170 0.0115
FALSE 267 0.0000340590    374   0.00810  0.170 0.0115
FALSE 268 0.0000336964    375   0.00807  0.170 0.0115
FALSE 269 0.0000336559    376   0.00804  0.170 0.0115
FALSE 270 0.0000318262    377   0.00800  0.171 0.0115
FALSE 271 0.0000318262    379   0.00794  0.171 0.0115
FALSE 272 0.0000318073    380   0.00791  0.171 0.0115
FALSE 273 0.0000311282    381   0.00787  0.171 0.0115
FALSE 274 0.0000305201    382   0.00784  0.171 0.0115
FALSE 275 0.0000302746    383   0.00781  0.171 0.0115
FALSE 276 0.0000296691    384   0.00778  0.171 0.0115
FALSE 277 0.0000291878    385   0.00775  0.171 0.0115
FALSE 278 0.0000288095    386   0.00772  0.171 0.0115
FALSE 279 0.0000281925    387   0.00769  0.171 0.0115
FALSE 280 0.0000279980    388   0.00767  0.171 0.0115
FALSE 281 0.0000274742    389   0.00764  0.171 0.0115
FALSE 282 0.0000273228    390   0.00761  0.171 0.0115
FALSE 283 0.0000272526    392   0.00756  0.171 0.0115
FALSE 284 0.0000272250    393   0.00753  0.171 0.0115
FALSE 285 0.0000269714    394   0.00750  0.171 0.0115
FALSE 286 0.0000259037    397   0.00742  0.171 0.0115
FALSE 287 0.0000256369    398   0.00739  0.172 0.0115
FALSE 288 0.0000255821    399   0.00737  0.172 0.0115
FALSE 289 0.0000255821    401   0.00732  0.172 0.0115
FALSE 290 0.0000254610    403   0.00727  0.172 0.0115
FALSE 291 0.0000252084    404   0.00724  0.172 0.0115
FALSE 292 0.0000250333    405   0.00722  0.172 0.0115
FALSE 293 0.0000248658    406   0.00719  0.172 0.0115
FALSE 294 0.0000245224    407   0.00717  0.172 0.0115
FALSE 295 0.0000243048    410   0.00709  0.172 0.0115
FALSE 296 0.0000240320    412   0.00704  0.172 0.0115
FALSE 297 0.0000240229    413   0.00702  0.172 0.0115
FALSE 298 0.0000238323    414   0.00700  0.172 0.0115
FALSE 299 0.0000236899    415   0.00697  0.172 0.0115
FALSE 300 0.0000236521    416   0.00695  0.172 0.0115
FALSE 301 0.0000233117    417   0.00693  0.172 0.0115
FALSE 302 0.0000227154    418   0.00690  0.172 0.0115
FALSE 303 0.0000224795    419   0.00688  0.172 0.0115
FALSE 304 0.0000222340    420   0.00686  0.172 0.0115
FALSE 305 0.0000222298    421   0.00683  0.172 0.0115
FALSE 306 0.0000218734    424   0.00677  0.172 0.0115
FALSE 307 0.0000218734    425   0.00675  0.172 0.0115
FALSE 308 0.0000217977    426   0.00672  0.172 0.0115
FALSE 309 0.0000211506    430   0.00664  0.172 0.0115
FALSE 310 0.0000207476    432   0.00659  0.173 0.0115
FALSE 311 0.0000206057    434   0.00655  0.173 0.0115
FALSE 312 0.0000204921    436   0.00651  0.173 0.0115
FALSE 313 0.0000200191    437   0.00649  0.173 0.0115
FALSE 314 0.0000193758    439   0.00645  0.173 0.0115
FALSE 315 0.0000183918    440   0.00643  0.173 0.0115
FALSE 316 0.0000183161    441   0.00641  0.173 0.0115
FALSE 317 0.0000183161    443   0.00638  0.173 0.0115
FALSE 318 0.0000174382    447   0.00630  0.173 0.0115
FALSE 319 0.0000170295    448   0.00629  0.173 0.0115
FALSE 320 0.0000166889    450   0.00625  0.173 0.0115
FALSE 321 0.0000164791    455   0.00617  0.173 0.0115
FALSE 322 0.0000164051    457   0.00614  0.173 0.0115
FALSE 323 0.0000160230    458   0.00612  0.173 0.0115
FALSE 324 0.0000156131    459   0.00610  0.173 0.0115
FALSE 325 0.0000151373    460   0.00609  0.173 0.0115
FALSE 326 0.0000151373    461   0.00607  0.173 0.0115
FALSE 327 0.0000147135    466   0.00600  0.173 0.0115
FALSE 328 0.0000146529    467   0.00598  0.173 0.0115
FALSE 329 0.0000145318    468   0.00597  0.173 0.0115
FALSE 330 0.0000139505    469   0.00595  0.173 0.0115
FALSE 331 0.0000137939    470   0.00594  0.173 0.0115
FALSE 332 0.0000136614    471   0.00593  0.173 0.0115
FALSE 333 0.0000136614    472   0.00591  0.173 0.0115
FALSE 334 0.0000135180    473   0.00590  0.173 0.0115
FALSE 335 0.0000134533    475   0.00587  0.173 0.0115
FALSE 336 0.0000123978    477   0.00584  0.173 0.0115
FALSE 337 0.0000123634    478   0.00583  0.173 0.0115
FALSE 338 0.0000122612    479   0.00582  0.173 0.0115
FALSE 339 0.0000112881    481   0.00579  0.173 0.0115
FALSE 340 0.0000111259    483   0.00577  0.173 0.0115
FALSE 341 0.0000109367    486   0.00574  0.173 0.0115
FALSE 342 0.0000109367    488   0.00572  0.173 0.0115
FALSE 343 0.0000103654    489   0.00571  0.173 0.0115
FALSE 344 0.0000103028    490   0.00570  0.173 0.0115
FALSE 345 0.0000099227    491   0.00569  0.173 0.0115
FALSE 346 0.0000096879    492   0.00568  0.173 0.0115
FALSE 347 0.0000096879    493   0.00567  0.173 0.0115
FALSE 348 0.0000091581    494   0.00566  0.173 0.0115
FALSE 349 0.0000090120    495   0.00565  0.173 0.0115
FALSE 350 0.0000088198    496   0.00564  0.173 0.0115
FALSE 351 0.0000085147    498   0.00562  0.173 0.0115
FALSE 352 0.0000084489    499   0.00561  0.173 0.0115
FALSE 353 0.0000084241    500   0.00560  0.173 0.0115
FALSE 354 0.0000081741    501   0.00559  0.173 0.0115
FALSE 355 0.0000074173    502   0.00559  0.173 0.0115
FALSE 356 0.0000074173    506   0.00556  0.173 0.0115
FALSE 357 0.0000072735    508   0.00554  0.173 0.0115
FALSE 358 0.0000068686    509   0.00553  0.173 0.0115
FALSE 359 0.0000067590    511   0.00552  0.173 0.0115
FALSE 360 0.0000063955    512   0.00551  0.173 0.0115
FALSE 361 0.0000056765    513   0.00551  0.173 0.0115
FALSE 362 0.0000054494    514   0.00550  0.173 0.0115
FALSE 363 0.0000050048    517   0.00549  0.173 0.0115
FALSE 364 0.0000046627    518   0.00548  0.173 0.0115
FALSE 365 0.0000045980    519   0.00548  0.173 0.0115
FALSE 366 0.0000044963    520   0.00547  0.173 0.0115
FALSE 367 0.0000043717    521   0.00547  0.173 0.0114
FALSE 368 0.0000037843    522   0.00546  0.173 0.0114
FALSE 369 0.0000036345    523   0.00546  0.173 0.0114
FALSE 370 0.0000036330    524   0.00546  0.173 0.0114
FALSE 371 0.0000033438    525   0.00545  0.173 0.0114
FALSE 372 0.0000030653    526   0.00545  0.173 0.0114
FALSE 373 0.0000029976    530   0.00544  0.173 0.0114
FALSE 374 0.0000029976    531   0.00543  0.173 0.0114
FALSE 375 0.0000029306    532   0.00543  0.173 0.0114
FALSE 376 0.0000027815    533   0.00543  0.173 0.0114
FALSE 377 0.0000024829    534   0.00542  0.173 0.0114
FALSE 378 0.0000024220    535   0.00542  0.173 0.0114
FALSE 379 0.0000024220    539   0.00541  0.173 0.0114
FALSE 380 0.0000024220    542   0.00540  0.173 0.0114
FALSE 381 0.0000022895    543   0.00540  0.173 0.0114
FALSE 382 0.0000020466    545   0.00540  0.173 0.0114
FALSE 383 0.0000020435    546   0.00540  0.173 0.0114
FALSE 384 0.0000019618    547   0.00539  0.173 0.0114
FALSE 385 0.0000018543    548   0.00539  0.173 0.0114
FALSE 386 0.0000018017    550   0.00539  0.173 0.0114
FALSE 387 0.0000013624    552   0.00538  0.173 0.0114
FALSE 388 0.0000012295    553   0.00538  0.173 0.0114
FALSE 389 0.0000010233    554   0.00538  0.173 0.0114
FALSE 390 0.0000009461    555   0.00538  0.173 0.0114
FALSE 391 0.0000009082    556   0.00538  0.173 0.0114
FALSE 392 0.0000006055    557   0.00538  0.173 0.0114
FALSE 393 0.0000003406    558   0.00538  0.173 0.0114
FALSE 394 0.0000001514    559   0.00538  0.173 0.0114
FALSE 395 0.0000001514    565   0.00538  0.173 0.0114
FALSE 396 0.0000001094    566   0.00538  0.173 0.0114
FALSE 397 0.0000000681    567   0.00538  0.173 0.0114
FALSE 398 0.0000000000    568   0.00538  0.173 0.0114

Prune Tree with optimal CP

# Prune the full tree back to the complexity parameter with the lowest xerror
Pruned_Tree <- prune(Tree_Full, cp = best_CP)

# Visualize the pruned tree, shading terminal nodes in gray
prp(Pruned_Tree,
    type = 1, extra = 1, under = TRUE, split.font = 1, varlen = -10,
    box.col = ifelse(Pruned_Tree$frame$var == "<leaf>", 'gray', 'white'))

# Count the terminal nodes (leaves) of the pruned tree
Length_Tree_Pruned <- sum(Pruned_Tree$frame$var == "<leaf>")

print(paste("There is", Length_Tree_Pruned, "Number of Leaves"))
FALSE [1] "There is 44 Number of Leaves"

Prediction Accuracy on the Validation Set with the Pruned Tree

# Score the validation set with both the full and the pruned tree
Predictions_Full_Tree <- predict(Tree_Full, Validation_TREE)
Predictions_Prune_Tree <- predict(Pruned_Tree, Validation_TREE)

# Validation RMSE for each tree
RMSE_Full_Tree <- RMSE(Validation_TREE$Price, Predictions_Full_Tree)
RMSE_Prune_Tree <- RMSE(Validation_TREE$Price, Predictions_Prune_Tree)

# Collect the three RMSE values side by side for display
RMSE_Full_Tree_2 <- as.data.frame(cbind(RMSE_Full_Tree, RMSE_Prune_Tree, RMSE_Validation))

# Render the comparison table with descriptive column headers
flextable(RMSE_Full_Tree_2) %>%
  set_header_labels(RMSE_Full_Tree_2,
                    RMSE_Full_Tree = "RMSE Validation - Full Tree with CP=0",
                    RMSE_Prune_Tree = "RMSE Validation - Prune Tree with CP=0.0008",
                    RMSE_Validation = "RMSE Validation - First Tree with CP=0.001")

We can see that pruning our regression tree helped us obtain a smaller RMSE compared to the full tree, but we have a slightly higher RMSE compared to the regression tree built in the first question with CP=0.001.

b. Price Variable into Categorical Variable

Here is the Price variable binned into 20 levels (Binned_Price)

# Duplicate the training data so the original numeric Price stays intact
New_Training_Tree <- Training_TREE

# Discretize Price into 20 equal-width bins (a factor with 20 levels)
New_Training_Tree$Price <- cut(New_Training_Tree$Price, breaks = 20)

# Rename Price -> Binned_Price by NAME rather than by column position;
# the original used colnames(...)[3], which silently renames the wrong
# column if the data layout ever changes
colnames(New_Training_Tree)[colnames(New_Training_Tree) == "Price"] <- "Binned_Price"

# Show the resulting bin boundaries
print(paste(levels(New_Training_Tree$Binned_Price)))
FALSE  [1] "(4.32e+03,5.76e+03]" "(5.76e+03,7.16e+03]" "(7.16e+03,8.57e+03]"
FALSE  [4] "(8.57e+03,9.98e+03]" "(9.98e+03,1.14e+04]" "(1.14e+04,1.28e+04]"
FALSE  [7] "(1.28e+04,1.42e+04]" "(1.42e+04,1.56e+04]" "(1.56e+04,1.7e+04]" 
FALSE [10] "(1.7e+04,1.84e+04]"  "(1.84e+04,1.98e+04]" "(1.98e+04,2.12e+04]"
FALSE [13] "(2.12e+04,2.26e+04]" "(2.26e+04,2.41e+04]" "(2.41e+04,2.55e+04]"
FALSE [16] "(2.55e+04,2.69e+04]" "(2.69e+04,2.83e+04]" "(2.83e+04,2.97e+04]"
FALSE [19] "(2.97e+04,3.11e+04]" "(3.11e+04,3.25e+04]"

i. Compare CT and RT. Are they different? (Structure, Top Predictors, Size of tree, etc.) Why?

Running Class Tree

# Fit a classification tree for the binned price.
# NOTE: the original formula listed `Automatic` twice; the duplicate term is
# redundant (model formulas deduplicate terms), so it has been removed.
CT_Tree <- rpart(Binned_Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic +
                   Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Period +
                   Airco + CD_Player + Powered_Windows + Sport_Model + Tow_Bar,
                 data = New_Training_Tree, method = "class")

# Plot the tree, shading terminal nodes
CT_Tree_Plot <- rpart.plot(CT_Tree, type = 0, varlen = 0,
                           box.col = ifelse(CT_Tree$frame$var == "<leaf>", '#8db0aa', 'white'),
                           fallen.leaves = FALSE, extra = FALSE, tweak = 0.7)

# Count the terminal nodes (leaves)
Length_Tree_CT <- length(CT_Tree$frame$var[CT_Tree$frame$var == "<leaf>"])
print(paste("There is", Length_Tree_CT, "Number of Leaves"))
FALSE [1] "There is 8 Number of Leaves"

We can see that this classification tree has 8 terminal nodes (leaves) and is much shallower than the regression tree used in the previous exercises. There are up to 4 layers in this CT.

Importance of Variables in CT

# Extract the variable-importance scores computed by rpart and display
# them as an interactive table (used to identify the top 3-4 predictors
# of a car's price)
CT_Tree_Importance <- as.data.frame(CT_Tree$variable.importance)
DT::datatable(CT_Tree_Importance)

Using our Class Tree output, we can see that the 4 most important variables are Age_08_04, KM, CD_Player and Airco

ii. Predict the price, using the RT and the CT, of a used Toyota Corolla with the following specifications.

# One-row data frame describing the car whose price we predict.
# NOTE: the original listed "Automatic" = 0 twice; data.frame() silently
# renamed the second occurrence to `Automatic.1`, so the duplicate has
# been removed.
Prediction_1_Car <- data.frame(
  "Age_08_04" = 77, "KM" = 117000, "Fuel_Type" = "Petrol", "HP" = 110,
  "Automatic" = 0, "Doors" = 5, "Quarterly_Tax" = 100, "Mfr_Guarantee" = 0,
  "Guarantee_Period" = 3, "Airco" = 1, "CD_Player" = 0,
  "Powered_Windows" = 0, "Sport_Model" = 0, "Tow_Bar" = 1)

# Class-probability predictions across the 20 price bins
Predicted_Car_Price <- predict(CT_Tree, Prediction_1_Car)

# Keep only the bins with a non-zero probability
Predicted_Car_Price <- Predicted_Car_Price[, colSums(Predicted_Car_Price != 0) > 0]

# Convert to a data frame for display
Predicted_Car_Price <- as.data.frame(Predicted_Car_Price)

# Single descriptive column name
colnames(Predicted_Car_Price) <- "Probabilities in %"

# Express probabilities as percentages rounded to 2 decimals
Predicted_Car_Price$`Probabilities in %` <- round(Predicted_Car_Price$`Probabilities in %` * 100, 2)

# Interactive table of the remaining bins
DT::datatable(Predicted_Car_Price)

iii. Compare the predictions in terms of the predictors that were used, the magnitude of the difference between the two predictions, and the advantages and disadvantages of the two methods.

CT_Tree_Plot <- rpart.plot(CT_Tree, type=0, varlen = 0, box.col=ifelse(CT_Tree$frame$var == "<leaf>", '#8db0aa', 'white'), fallen.leaves = FALSE, extra = FALSE, tweak = 0.8)

Using the CT, we first start with Age_08_04 >= 57 -> Yes, then Age_08_04 >= 70 -> Yes, then KM >= 174000 -> No, and we stop at (7.16e+03,8.57e+03], which has a 53.57% probability for this specific car. We use far fewer steps to reach our prediction compared to the regression tree. By using a CT instead of an RT, we do not get an exact price value; instead we get a probability distribution over the price bins our car could fall into, and selecting the highest probability assigns the car's price to a category. CTs have less precision in their predicted outcome but far fewer branches and terminal nodes, making them easier to read and to understand each variable's importance in the prediction process, as well as providing probabilities, which can be useful depending on the context.

Ex 11.3

Car Sales

The Goal is to predict the price of a used Toyota Corolla based on its specifications.

a. Fit a neural network model to the data. Use a single hidden layer with 2 nodes.

Use predictors Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco, CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.

Splitting Training and Validation Set - 60% VS 40%

# Reproducible Results
set.seed(1)

# Required packages
library(neuralnet)
library(nnet)
library(caret)

# Load the dataset with fread()
ToyotaDT_NN <- fread("DATA/ToyotaCorolla.csv")

# Keep the outcome (Price) plus the 15 requested predictors
ToyotaDT_NN <- ToyotaDT_NN[,c("Price","Age_08_04","KM","Fuel_Type","HP","Automatic","Doors","Quarterly_Tax","Mfr_Guarantee","Guarantee_Period","Airco","Automatic_airco","CD_Player","Powered_Windows","Sport_Model","Tow_Bar")]

# Make Fuel_Type a factor BEFORE splitting so both partitions share the
# same level set and therefore yield identical dummy columns
ToyotaDT_NN$Fuel_Type <- factor(ToyotaDT_NN$Fuel_Type)

# Input cutting ratio
probability_train <- 0.6
probability_test <- 1 - probability_train

# Random 60/40 training/validation split
sample <- sample(c(TRUE, FALSE), nrow(ToyotaDT_NN), replace = TRUE,
                 prob = c(probability_train, probability_test))
Train_NN  <- ToyotaDT_NN[sample, ]
Test_NN   <- ToyotaDT_NN[!sample, ]

# Proportions check
Prop_Training <- nrow(Train_NN) / nrow(ToyotaDT_NN) * 100
Prop_Validation <- nrow(Test_NN) / nrow(ToyotaDT_NN) * 100

# Preprocess variables: dummies for Fuel_Type, [0, 1] rescaling for numerics

## BUG FIX: the original built the fuel-type dummies from Train_NN only and
## then cbind-ed that SAME training-sized frame onto Test_NN, attaching the
## wrong rows (and the wrong number of rows) to the validation set. Each
## partition now gets dummies built from its OWN rows; the shared factor
## levels set above guarantees matching columns.
Fuel_Dummies_Train <- as.data.frame(model.matrix(~ 0 + Fuel_Type, data = Train_NN))
Fuel_Dummies_Test <- as.data.frame(model.matrix(~ 0 + Fuel_Type, data = Test_NN))

## Replace the Fuel_Type column with its dummies in each partition
Train_NN <- cbind(Train_NN[, -c("Fuel_Type")], Fuel_Dummies_Train)
Test_NN <- cbind(Test_NN[, -c("Fuel_Type")], Fuel_Dummies_Test)

## Fit the [0, 1] rescaling on the TRAINING numeric columns only, to avoid
## leaking validation-set information into the scaling
preProcValues <- preProcess(Train_NN[, c("Price","Age_08_04","KM","HP","Doors","Quarterly_Tax","Guarantee_Period")], method = "range")

## Apply the same (training-fitted) transform to both partitions
Train_NN_Preprocess <- predict(preProcValues, Train_NN)
Test_NN_Preprocess <- predict(preProcValues, Test_NN)

# Print proportions
print(paste(round(Prop_Training, 2), "% In Training", round(Prop_Validation, 2), "% In Validation"))
FALSE [1] "61.42 % In Training 38.58 % In Validation"

Neural Network A - 1 Hidden Layer with 2 Nodes

# Reproducible Results
set.seed(1)

# Network A: one hidden layer with 2 nodes; linear output because this is
# a regression task. (Use TRUE, not the reassignable shorthand T.)
nn_1 <- neuralnet(Price ~ ., data = Train_NN_Preprocess, linear.output = TRUE, hidden = 2)

plot(nn_1, rep = "best")

Weights of the Neural Network A

# Display the fitted weight matrices (input -> hidden layer, hidden -> output)
nn_1$weights
FALSE [[1]]
FALSE [[1]][[1]]
FALSE          [,1]   [,2]
FALSE  [1,] -0.4309  2.209
FALSE  [2,]  2.0122  3.652
FALSE  [3,]  0.8866 -1.037
FALSE  [4,] -0.7398 -1.433
FALSE  [5,] -0.1306  0.255
FALSE  [6,] -0.1019 -0.201
FALSE  [7,] -0.6180  1.546
FALSE  [8,] -0.0805 -0.176
FALSE  [9,]  0.1344 16.006
FALSE [10,] -0.0968 -0.293
FALSE [11,] -0.2474 35.270
FALSE [12,]  0.0789  1.118
FALSE [13,] -0.1134 -0.299
FALSE [14,]  0.0771  1.787
FALSE [15,]  0.1233  1.682
FALSE [16,]  0.7030 -1.676
FALSE [17,]  0.5789 -1.406
FALSE [18,]  0.7630  0.267
FALSE 
FALSE [[1]][[2]]
FALSE        [,1]
FALSE [1,] -0.365
FALSE [2,] -1.028
FALSE [3,]  1.419

Neural Network B - 1 Hidden Layer with 5 Nodes

# Reproducible Results
set.seed(1)

# Network B: one hidden layer with 5 nodes; linear output because this is
# a regression task. (Use TRUE, not the reassignable shorthand T.)
nn_2 <- neuralnet(Price ~ ., data = Train_NN_Preprocess, linear.output = TRUE, hidden = 5)

plot(nn_2, rep = "best")

Weights of the Neural Network B

# Display the fitted weight matrices (input -> hidden layer, hidden -> output)
nn_2$weights
FALSE [[1]]
FALSE [[1]][[1]]
FALSE          [,1]     [,2]    [,3]   [,4]     [,5]
FALSE  [1,] -1.0822   0.2664 -0.7609  1.869  0.39221
FALSE  [2,] -1.6464  -0.6606 -1.4234  1.682 -1.92659
FALSE  [3,] -0.3819   0.6737 -0.8228 -0.520 -1.17504
FALSE  [4,]  1.1282  -0.0761  0.9746 -0.426  0.19693
FALSE  [5,] -0.8555  -5.9541  0.8139  2.845  0.15236
FALSE  [6,] -0.2748  -0.8334  0.4679  0.254  0.16805
FALSE  [7,]  0.0783  -4.9264  1.6873  2.500 -0.09956
FALSE  [8,]  0.4218   0.6210  0.2834 -0.246 -0.39992
FALSE  [9,]  1.3905 -35.2628 -1.0621 18.735 -0.72520
FALSE [10,]  0.0566   0.4204  0.0491 -0.165  0.08564
FALSE [11,]  1.2989   4.3655 -0.5550 -0.871  0.03334
FALSE [12,] -0.2900  -1.4439  1.0142  1.175 -0.82484
FALSE [13,] -0.1653   1.9985  0.1753 -1.415  0.40796
FALSE [14,] -0.3060 -91.3117  0.2948  2.309 -0.00232
FALSE [15,]  0.2535 -78.1617 -0.0750  2.528 -0.44375
FALSE [16,]  1.1113   1.3446 -1.7674 -1.909 -1.48382
FALSE [17,]  0.5031  -1.6160 -1.2026 -0.971 -0.16114
FALSE [18,]  0.4911  -1.0049 -1.5398 -0.259 -0.24136
FALSE 
FALSE [[1]][[2]]
FALSE        [,1]
FALSE [1,] -0.743
FALSE [2,]  0.439
FALSE [3,]  0.201
FALSE [4,]  0.414
FALSE [5,]  0.743
FALSE [6,]  0.496

Neural Network C - 2 Hidden Layers with 5 Nodes in each

# Reproducible Results
set.seed(1)

# Network C: two hidden layers with 5 nodes each; linear output because
# this is a regression task. (Use TRUE, not the reassignable shorthand T.)
nn_3 <- neuralnet(Price ~ ., data = Train_NN_Preprocess, linear.output = TRUE, hidden = c(5, 5))

plot(nn_3, rep = "best")

Weights of the Neural Network C

# Display the fitted weight matrices (input -> hidden 1, hidden 1 -> hidden 2,
# hidden 2 -> output)
nn_3$weights
FALSE [[1]]
FALSE [[1]][[1]]
FALSE          [,1]    [,2]   [,3]   [,4]     [,5]
FALSE  [1,]  1.0014  0.8207 -0.263 -1.308    1.154
FALSE  [2,]  2.6944 -3.8210  2.189 10.242   -2.117
FALSE  [3,]  1.9919 -1.8167  0.679 -1.140    0.783
FALSE  [4,]  2.6962  1.9304 -0.935 -5.662    9.611
FALSE  [5,] -2.0652 -0.1293 -0.492  1.182    0.806
FALSE  [6,]  0.0779 -0.6930 -1.155 -0.298   -0.300
FALSE  [7,] -5.3986  3.0127  1.366  4.488   12.672
FALSE  [8,]  2.3418  0.8618  0.378 -0.762    0.396
FALSE  [9,] -3.7351 -1.2264 -1.887 33.366   -3.809
FALSE [10,] -1.1707 -0.2756 -0.432 -1.375   -2.054
FALSE [11,] -5.5473 -0.0329 -0.332  4.946   -2.549
FALSE [12,]  1.0162  0.2500  0.318  3.204    6.706
FALSE [13,] -0.8608  0.4563  0.167  0.141   19.021
FALSE [14,] -4.3215 -0.0348  0.619  1.386   -1.194
FALSE [15,]  0.3951  0.1364  0.283  0.611    4.325
FALSE [16,] -2.9950 -3.3677 -0.303 -3.822 -341.739
FALSE [17,]  4.2604 -3.1110 -2.016 -5.497  -36.463
FALSE [18,]  0.9853 -0.4030 -0.868  0.543    0.255
FALSE 
FALSE [[1]][[2]]
FALSE        [,1]   [,2]   [,3]   [,4]   [,5]
FALSE [1,] -0.495 -1.264 -0.924  -3.26 -0.322
FALSE [2,]  1.039 -1.681  0.586  -2.20 -0.930
FALSE [3,] 36.661 -2.006 -0.832 -11.08  0.957
FALSE [4,]  0.554  0.297  2.981  48.03 -0.701
FALSE [5,]  0.937 -0.477  0.413   8.22  0.725
FALSE [6,]  0.718  2.243  1.117   1.29 -0.291
FALSE 
FALSE [[1]][[3]]
FALSE        [,1]
FALSE [1,] -0.624
FALSE [2,]  1.224
FALSE [3,] -0.286
FALSE [4,] -0.324
FALSE [5,] -0.247
FALSE [6,]  0.745

Predictions and RMSE

RMSE For Training Set

# Predictions and RMSE on the TRAINING set -------------------------------

# Network predictions on the (rescaled) training data
Train_Prediction_nn_1 <- predict(nn_1, Train_NN_Preprocess)
Train_Prediction_nn_2 <- predict(nn_2, Train_NN_Preprocess)
Train_Prediction_nn_3 <- predict(nn_3, Train_NN_Preprocess)

# Back-transform from the [0, 1] scale to the original price scale.
# The training-set price range is computed once instead of being
# recomputed in every expression.
Price_Min <- min(Train_NN$Price)
Price_Range <- max(Train_NN$Price) - Price_Min

Train_Prediction_nn_1 <- Train_Prediction_nn_1 * Price_Range + Price_Min
Train_Prediction_nn_2 <- Train_Prediction_nn_2 * Price_Range + Price_Min
Train_Prediction_nn_3 <- Train_Prediction_nn_3 * Price_Range + Price_Min

# Training RMSE for each architecture
RMSE_Train_Prediction_nn_1 <- RMSE(Train_Prediction_nn_1, Train_NN$Price)
RMSE_Train_Prediction_nn_2 <- RMSE(Train_Prediction_nn_2, Train_NN$Price)
RMSE_Train_Prediction_nn_3 <- RMSE(Train_Prediction_nn_3, Train_NN$Price)

# NOTE: the original stored this vector in a variable literally named `RMSE`,
# masking caret::RMSE(). A distinct name is used here; the display column is
# still called "RMSE" via the named cbind() argument.
RMSE_Train_Values <- round(c(RMSE_Train_Prediction_nn_1,
                             RMSE_Train_Prediction_nn_2,
                             RMSE_Train_Prediction_nn_3), 2)

# Model labels for the table
Model <- c("Neural A: 1 Hidden Layer, 2 Nodes",
           "Neural B: 1 Hidden Layer, 5 Nodes",
           "Neural C: 2 Hidden Layer, 5 Nodes")

Frame_RMSE <- as.data.frame(cbind(Model, RMSE = RMSE_Train_Values))

RMSE_DATA <- flextable(Frame_RMSE) %>% fontsize(size = 8, part = "all")

(RMSE_DATA)

RMSE For Validation Set

# Predictions and RMSE on the VALIDATION set -----------------------------

# Network predictions on the (rescaled) validation data
Validation_Prediction_nn_1 <- predict(nn_1, Test_NN_Preprocess)
Validation_Prediction_nn_2 <- predict(nn_2, Test_NN_Preprocess)
Validation_Prediction_nn_3 <- predict(nn_3, Test_NN_Preprocess)

# Back-transform using the TRAINING-set price range (the same scale the
# preprocessing was fitted on), computed once instead of per expression
Price_Min_Train <- min(Train_NN$Price)
Price_Range_Train <- max(Train_NN$Price) - Price_Min_Train

Validation_Prediction_nn_1 <- Validation_Prediction_nn_1 * Price_Range_Train + Price_Min_Train
Validation_Prediction_nn_2 <- Validation_Prediction_nn_2 * Price_Range_Train + Price_Min_Train
Validation_Prediction_nn_3 <- Validation_Prediction_nn_3 * Price_Range_Train + Price_Min_Train

# Validation RMSE for each architecture
RMSE_Validation_Prediction_1 <- RMSE(Validation_Prediction_nn_1, Test_NN$Price)
RMSE_Validation_Prediction_2 <- RMSE(Validation_Prediction_nn_2, Test_NN$Price)
RMSE_Validation_Prediction_3 <- RMSE(Validation_Prediction_nn_3, Test_NN$Price)

# Collect and round the three RMSE values
RMSE_Validation <- round(c(RMSE_Validation_Prediction_1,
                           RMSE_Validation_Prediction_2,
                           RMSE_Validation_Prediction_3), 2)

# Model labels for the table
Model <- c("Neural A: 1 Hidden Layer, 2 Nodes",
           "Neural B: 1 Hidden Layer, 5 Nodes",
           "Neural C: 2 Hidden Layer, 5 Nodes")

Frame_RMSE_Validation <- as.data.frame(cbind(Model, RMSE_Validation))

# Render the table with a cleaner header for the RMSE column
RMSE_VALIDATION_DATA <- flextable(Frame_RMSE_Validation) %>%
  fontsize(size = 8, part = "all") %>%
  set_header_labels(RMSE_Validation = "RMSE")

(RMSE_VALIDATION_DATA)

i. What happens to the RMS error for the training data as the number of layers and nodes increases ?

The training RMSE tends to decrease as we make our neural network more complex (Model C in this case). This indicates that we are increasingly overfitting our model to the training dataset.

ii. What happens to the RMS error for the validation data?

The RMSE on the validation data is lowest for Neural Model B, and then it increases with Model C.

iii. Comment on the appropriate number of layers and nodes for this application.

Models A and B are close, but Model A is not in an overfitting situation. We should decide whether we prefer the better predictions of Model B, at the cost of more overfitting to the training dataset, or Model A, which gives less accurate results but more stability in future predictions since it does not overfit the way Model C does. In this case, I would prefer Model B, which gives more accurate results even though it is more prone to instability — thus choosing 1 hidden layer with 5 nodes.